Parameters that might affect performance

This notebook examines how parameters in the semantic model of the Danish language affect its performance.

  • Number of pages read
  • Use of stopwords
  • Exclusion of short pages
  • Scaling of matrix tfidf/count
  • Normalization of documents
  • Factorization of matrix

In [1]:
from everything import *
from dasem.semantic import Semantic
from dasem.data import wordsim353 as wordsim353_data

In [2]:
# Read the two evaluation datasets:
#  - four_words: outlier-detection task; columns word1..word4, where word4
#    holds the true outlier (see compute_accuracy below).
#  - wordsim353: word-pair similarity dataset with Danish translations (da1,
#    da2) and human ratings in the 'Human (mean)' column.
four_words = read_csv('../dasem/data/four_words.csv', encoding='utf-8')
wordsim353 = wordsim353_data()

In [3]:
def compute_accuracy(semantic, four_words):
    """Return the fraction of tasks where the model finds the true outlier.

    Parameters
    ----------
    semantic : object
        Model exposing a `sort_by_outlierness(words)` method that returns the
        words ordered most-outlier-first.
    four_words : pandas.DataFrame
        One row per task; the first four columns are the candidate words and
        the `word4` column holds the true outlier.

    Returns
    -------
    float
        Mean of the indicator "predicted outlier == word4".
    """
    predicted_outliers = [
        semantic.sort_by_outlierness(row.values[:4])[0]
        for _, row in four_words.iterrows()
    ]
    return mean(four_words.word4 == predicted_outliers)

In [4]:
def compute_correlation(semantic, wordsim):
    """Return the Pearson correlation between human scores and the model.

    Word pairs for which the model relatedness is NaN (e.g. words missing
    from the model) are dropped before computing the correlation.

    Parameters
    ----------
    semantic : object
        Model exposing `relatedness(words)` returning a symmetric matrix of
        pairwise relatedness values.
    wordsim : pandas.DataFrame
        Word pairs in columns `da1`/`da2` with ratings in 'Human (mean)'.

    Returns
    -------
    float
        Pearson correlation coefficient.
    """
    human_scores = []
    model_scores = []
    for _, row in wordsim.iterrows():
        relatedness_matrix = semantic.relatedness([row.da1, row.da2])
        model_scores.append(relatedness_matrix[0, 1])
        human_scores.append(row['Human (mean)'])
    human_scores = array(human_scores)
    model_scores = array(model_scores)
    # Keep only pairs where the model produced a defined relatedness.
    valid = ~isnan(model_scores)
    return corrcoef(human_scores[valid], model_scores[valid])[0, 1]

In [5]:
# Grid search over the Semantic model hyper-parameters, recording outlier
# accuracy and wordsim correlation for each combination.
#
# NOTE(review): this cell is expensive -- it fits one Semantic model per
# parameter combination (2 * 3 * 2 * 2 * 3 = 72 fits).
max_n_pagess = [3000, 30000, None]
norms = ['l1', 'l2', None]
stop_wordss = [None, set(nltk.corpus.stopwords.words('danish'))]
use_idfs = [True, False]
sublinear_tfs = [True, False]

columns = ['accuracy', 'correlation', 'stop_words', 'use_idf', 'norm', 'sublinear_tf', 'max_n_pages']

# Build one record per combination and construct the frame once at the end.
# This replaces the removed pandas `.ix` indexer and avoids cell-by-cell
# assignment of strings/bools into a float-typed frame.
records = []
for stop_words_index, stop_words in enumerate(stop_wordss):
    for norm in norms:
        for use_idf in use_idfs:
            for sublinear_tf in sublinear_tfs:
                for max_n_pages in max_n_pagess:
                    semantic = Semantic(stop_words=stop_words, norm=norm,
                                        use_idf=use_idf, sublinear_tf=sublinear_tf,
                                        max_n_pages=max_n_pages)
                    records.append({
                        'accuracy': compute_accuracy(semantic, four_words),
                        'correlation': compute_correlation(semantic, wordsim353),
                        # Store the index (0/1) instead of the stopword set
                        # itself so the column stays numeric.
                        'stop_words': stop_words_index,
                        'use_idf': use_idf,
                        'norm': str(norm),
                        'sublinear_tf': sublinear_tf,
                        'max_n_pages': max_n_pages,
                    })
results = DataFrame(records, columns=columns)

In [6]:
# Attach a model-relatedness column to wordsim353 for plotting below.
# NOTE(review): `semantic` here is whatever model was fitted LAST in the grid
# search above (the final loop iteration: stopwords enabled, norm=None,
# use_idf=False, sublinear_tf=False, max_n_pages=None).  Re-running cells out
# of order changes this result -- consider re-fitting an explicit model here.
relatednesses = []
for idx, row in wordsim353.iterrows():
    R = semantic.relatedness([row.da1, row.da2])
    relatednesses.append(R[0, 1])
wordsim353['relatedness'] = relatednesses

In [7]:
wordsim353


Out[7]:
Word 1 da1 Word 2 da2 Human (mean) Problem relatedness
0 love kærlighed sex sex 6.77 NaN 0.069031
1 tiger tiger cat kat 7.35 NaN 0.024325
2 tiger tiger tiger tiger 10.00 NaN 1.000000
3 book bog paper papir 7.46 NaN 0.031266
4 computer computer keyboard tastatur 7.62 NaN 0.117331
5 computer computer internet internet 7.58 NaN 0.059367
6 plane fly car bil 5.77 NaN 0.013637
7 train tog car bil 6.31 NaN 0.026891
8 telephone telefon communication kommunikation 7.50 NaN 0.007303
9 television tv radio radio 6.77 NaN 0.164519
10 media medie radio radio 7.42 NaN 0.032748
11 drug narkotika abuse misbrug 6.85 NaN 0.074244
12 bread brød butter smør 6.19 NaN 0.075498
13 cucumber agurk potato kartoffel 5.92 NaN 0.070229
14 doctor læge nurse sygeplejerske 7.00 NaN 0.078341
15 professor professor doctor læge 6.62 NaN 0.135296
16 student studerende professor professor 6.81 NaN 0.087950
17 smart klog student studerende 4.62 NaN 0.007902
18 smart klog stupid dum 5.81 NaN 0.033734
19 company firma stock aktie 7.08 NaN 0.023840
20 stock aktie market marked 8.08 NaN 0.036268
21 stock aktie phone telefon 1.62 NaN 0.012286
22 stock aktie CD CD 1.31 NaN 0.000482
23 stock aktie jaguar jaguar 0.92 NaN 0.001671
24 stock aktie egg æg 1.81 NaN 0.014525
25 fertility frugtbar egg æg 6.69 NaN 0.019678
27 stock aktie life liv 0.92 NaN 0.009396
28 book bog library bibliotek 7.46 NaN 0.064770
29 bank bank money penge 8.12 NaN 0.121945
30 wood træ forest skov 7.73 NaN 0.040285
... ... ... ... ... ... ... ...
322 gender køn equality lighed 6.41 NaN 0.066694
323 change ændring attitude holdning 5.44 NaN 0.086168
324 family familie planning planlægning 6.25 NaN 0.003667
325 opera opera industry industri 2.63 NaN 0.007351
326 sugar sukker approach tilgang 0.88 NaN 0.012265
327 practice øvelse institution institution 3.19 NaN 0.005296
328 ministry ministerium culture kultur 4.69 NaN 0.031118
329 problem problem challenge udfordring 6.75 NaN 0.055777
330 size størrelse prominence fremtrædende 5.31 NaN 0.067736
331 country land citizen borger 7.31 NaN 0.018616
332 planet planet people folk 5.75 NaN 0.025209
333 development udvikling issue spørgsmål 3.97 NaN 0.129536
334 experience oplevelse music musik 3.47 NaN 0.045611
335 music musik project projekt 3.63 NaN 0.047501
336 glass glas metal metal 5.56 NaN 0.012299
337 aluminum aluminium metal metal 7.83 NaN 0.019734
338 chance chance credibility troværdighed 3.88 NaN 0.032975
340 concert koncert virtuoso virtuos 6.81 NaN 0.012196
341 rock rock jazz jazz 7.59 NaN 0.080152
342 museum museum theater teater 7.19 NaN 0.043194
343 observation observation architecture arkitektur 4.38 NaN 0.003176
344 space rum world verden 6.53 NaN 0.083000
345 preservation bevarelse world verden 6.19 NaN 0.023978
346 admission adgang ticket billet 7.69 NaN 0.042346
347 shower byge thunderstorm tordenbyge 6.31 NaN 0.000000
348 shower byge flood oversvømmelse 6.03 NaN 0.010859
349 weather vejr forecast vejrudsigt 8.34 NaN 0.062763
350 disaster katastrofe area område 6.25 NaN 0.086862
351 governor guvernør office kontor 6.34 NaN 0.022742
352 architecture arkitektur century århundrede 3.78 NaN 0.093821

319 rows × 7 columns


In [12]:
# Scatter plot of human similarity ratings against model relatedness.
# NOTE(review): yscale/ylim/title/show are presumably pylab-style globals
# pulled in by `from everything import *` in the first cell -- confirm.
wordsim353.plot(x='Human (mean)', y='relatedness', kind='scatter')
yscale('log')
ylim(0.0001, 1)  # floor keeps zero/near-zero relatedness values on the log axis
title('Scatter plot of Wordsim353 data')
show()



In [9]:
results


Out[9]:
accuracy correlation stop_words use_idf norm sublinear_tf max_n_pages
0 0.36 0.274049 0.0 True l1 True 3000.0
1 0.56 0.210682 0.0 True l1 True 30000.0
2 0.72 0.135028 0.0 True l1 True NaN
3 0.34 0.292945 0.0 True l1 False 3000.0
4 0.60 0.216162 0.0 True l1 False 30000.0
5 0.74 0.137733 0.0 True l1 False NaN
6 0.38 0.279397 0.0 False l1 True 3000.0
7 0.56 0.214376 0.0 False l1 True 30000.0
8 0.72 0.138702 0.0 False l1 True NaN
9 0.36 0.292561 0.0 False l1 False 3000.0
10 0.58 0.216529 0.0 False l1 False 30000.0
11 0.72 0.140512 0.0 False l1 False NaN
12 0.38 0.351491 0.0 True l2 True 3000.0
13 0.64 0.328927 0.0 True l2 True 30000.0
14 0.84 0.261032 0.0 True l2 True NaN
15 0.38 0.331091 0.0 True l2 False 3000.0
16 0.58 0.321919 0.0 True l2 False 30000.0
17 0.78 0.254649 0.0 True l2 False NaN
18 0.38 0.350975 0.0 False l2 True 3000.0
19 0.58 0.334352 0.0 False l2 True 30000.0
20 0.78 0.268553 0.0 False l2 True NaN
21 0.40 0.324016 0.0 False l2 False 3000.0
22 0.60 0.299873 0.0 False l2 False 30000.0
23 0.78 0.242552 0.0 False l2 False NaN
24 0.40 0.334992 0.0 True None True 3000.0
25 0.58 0.378010 0.0 True None True 30000.0
26 0.76 0.365937 0.0 True None True NaN
27 0.44 0.311267 0.0 True None False 3000.0
28 0.56 0.353467 0.0 True None False 30000.0
29 0.72 0.328984 0.0 True None False NaN
... ... ... ... ... ... ... ...
42 0.38 0.279397 1.0 False l1 True 3000.0
43 0.56 0.214376 1.0 False l1 True 30000.0
44 0.72 0.138702 1.0 False l1 True NaN
45 0.36 0.292561 1.0 False l1 False 3000.0
46 0.58 0.216529 1.0 False l1 False 30000.0
47 0.72 0.140512 1.0 False l1 False NaN
48 0.38 0.351491 1.0 True l2 True 3000.0
49 0.64 0.328927 1.0 True l2 True 30000.0
50 0.84 0.261032 1.0 True l2 True NaN
51 0.38 0.331091 1.0 True l2 False 3000.0
52 0.58 0.321919 1.0 True l2 False 30000.0
53 0.78 0.254649 1.0 True l2 False NaN
54 0.38 0.350975 1.0 False l2 True 3000.0
55 0.58 0.334352 1.0 False l2 True 30000.0
56 0.78 0.268553 1.0 False l2 True NaN
57 0.40 0.324016 1.0 False l2 False 3000.0
58 0.60 0.299873 1.0 False l2 False 30000.0
59 0.78 0.242552 1.0 False l2 False NaN
60 0.40 0.334992 1.0 True None True 3000.0
61 0.58 0.378010 1.0 True None True 30000.0
62 0.76 0.365937 1.0 True None True NaN
63 0.44 0.311267 1.0 True None False 3000.0
64 0.56 0.353467 1.0 True None False 30000.0
65 0.72 0.328984 1.0 True None False NaN
66 0.38 0.334995 1.0 False None True 3000.0
67 0.60 0.377975 1.0 False None True 30000.0
68 0.74 0.365990 1.0 False None True NaN
69 0.42 0.311267 1.0 False None False 3000.0
70 0.56 0.353424 1.0 False None False 30000.0
71 0.72 0.328968 1.0 False None False NaN

72 rows × 7 columns


In [10]:
# Regress accuracy on the grid-search factors to see which parameters matter
# (per the output below: a Gaussian-family, identity-link GLM, i.e. OLS).
# NOTE(review): `smf` (presumably statsmodels.formula.api) comes from the
# star import in the first cell -- confirm.
formula = 'accuracy ~ stop_words + use_idf + norm + sublinear_tf + max_n_pages'
model = smf.glm(formula, data=results).fit()
model.summary()


Out[10]:
Generalized Linear Model Regression Results
Dep. Variable: accuracy No. Observations: 48
Model: GLM Df Residuals: 41
Model Family: Gaussian Df Model: 6
Link Function: identity Scale: 0.000536585365854
Method: IRLS Log-Likelihood: 116.40
Date: Tue, 11 Oct 2016 Deviance: 0.022000
Time: 01:13:12 Pearson chi2: 0.0220
No. Iterations: 2
coef std err z P>|z| [0.025 0.975]
Intercept 0.3713 0.009 40.567 0.000 0.353 0.389
use_idf[T.True] 0.0017 0.007 0.249 0.803 -0.011 0.015
norm[T.l1] -0.0250 0.008 -3.053 0.002 -0.041 -0.009
norm[T.l2] 1.509e-16 0.008 1.84e-14 1.000 -0.016 0.016
sublinear_tf[T.True] -0.0017 0.007 -0.249 0.803 -0.015 0.011
stop_words 3.816e-17 0.007 5.71e-15 1.000 -0.013 0.013
max_n_pages 7.346e-06 2.48e-07 29.660 0.000 6.86e-06 7.83e-06

In [ ]: